The following libraries are used in for in this notebook:
# Load libraries
library(tidyverse)
library(tidymodels)
library(readr)
library(glmnet)
library(leaps)
library(naniar)
# Read csv with listing information
data <- read_csv(gzfile("listings.csv.gz"))
38 parsing failures.
row col expected actual file
15026 license 1/0/T/F/TRUE/FALSE NL857416819B01 <connection>
15673 license 1/0/T/F/TRUE/FALSE NL825517485B01 <connection>
15961 license 1/0/T/F/TRUE/FALSE NL825517485B01 <connection>
16391 license 1/0/T/F/TRUE/FALSE 855596338B01 <connection>
16823 license 1/0/T/F/TRUE/FALSE NL854649426B01 <connection>
..... ....... .................. .............. ............
See problems(...) for more details.
The outcomes of the model will be used for suggestions to the new hosts about the average platform price for similar listings. Then hosts can choose whether they want to use the recommendation to set their prices accordingly in order to be competitive and gain attention from the guests since the beginning. All variables including information on the reviews give information about a listing after it has been published. Therefore, this variables are not included in the data set. Moreover, the variables including a description and summary about the listing can be analyzed using NLP (e.g. sentiment analysis). However, this is beyond the scope of the assignment. Therefore, these variables are excluded from the model. The variables below are included in the data set for further analysis and cleaning.
# Generate subset with variables of interest
data_sub <- data %>%
select(id,
price,
property_type,
room_type,
accommodates,
bathrooms,
bedrooms,
beds,
bed_type,
amenities,
host_since,
host_response_time,
host_response_rate,
host_neighbourhood,
host_listings_count,
host_verifications,
host_identity_verified,
neighbourhood_cleansed,
square_feet,
cleaning_fee,
guests_included,
extra_people,
minimum_nights,
maximum_nights,
availability_30,
availability_60,
availability_90,
availability_365,
instant_bookable,
cancellation_policy,
require_guest_profile_picture,
require_guest_phone_verification,
calculated_host_listings_count,
calculated_host_listings_count_entire_homes,
calculated_host_listings_count_private_rooms,
calculated_host_listings_count_shared_rooms)
# Inspect data
head(data_sub)
# Inspect data
str(data_sub)
tibble [20,025 x 36] (S3: tbl_df/tbl/data.frame)
$ id : num [1:20025] 2818 20168 25428 27886 28871 ...
$ price : chr [1:20025] "$59.00" "$80.00" "$125.00" "$155.00" ...
$ property_type : chr [1:20025] "Apartment" "Townhouse" "Apartment" "Houseboat" ...
$ room_type : chr [1:20025] "Private room" "Private room" "Entire home/apt" "Private room" ...
$ accommodates : num [1:20025] 2 2 3 2 2 1 4 2 2 4 ...
$ bathrooms : num [1:20025] 1.5 1 1 1 1 1 1 1 1 1 ...
$ bedrooms : num [1:20025] 1 1 1 1 1 1 3 1 1 2 ...
$ beds : num [1:20025] 2 1 1 1 1 1 3 1 1 4 ...
$ bed_type : chr [1:20025] "Real Bed" "Real Bed" "Real Bed" "Real Bed" ...
$ amenities : chr [1:20025] "{Internet,Wifi,\"Paid parking off premises\",\"Buzzer/wireless intercom\",Heating,Washer,\"Smoke detector\",\"C"| __truncated__ "{TV,Internet,Wifi,\"Paid parking off premises\",Heating,\"Smoke detector\",\"Carbon monoxide detector\",\"Fire "| __truncated__ "{TV,\"Cable TV\",Internet,Wifi,Kitchen,Elevator,\"Indoor fireplace\",\"Buzzer/wireless intercom\",Heating,\"Fam"| __truncated__ "{TV,Internet,Wifi,Breakfast,Heating,\"Smoke detector\",\"Carbon monoxide detector\",\"Fire extinguisher\",Essen"| __truncated__ ...
$ host_since : Date[1:20025], format: "2008-09-24" "2009-12-02" "2009-11-20" "2010-03-23" ...
$ host_response_time : chr [1:20025] "within an hour" "within an hour" "within an hour" "within an hour" ...
$ host_response_rate : chr [1:20025] "100%" "100%" "100%" "100%" ...
$ host_neighbourhood : chr [1:20025] "Indische Buurt" "Grachtengordel" "Grachtengordel" "Westelijke Eilanden" ...
$ host_listings_count : num [1:20025] 1 2 2 1 3 3 1 1 NA 1 ...
$ host_verifications : chr [1:20025] "['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']" "['email', 'phone', 'reviews', 'jumio', 'offline_government_id', 'selfie', 'government_id', 'identity_manual']" "['email', 'phone', 'reviews']" "['email', 'phone', 'reviews', 'jumio']" ...
$ host_identity_verified : logi [1:20025] FALSE FALSE FALSE TRUE TRUE TRUE ...
$ neighbourhood_cleansed : chr [1:20025] "Oostelijk Havengebied - Indische Buurt" "Centrum-Oost" "Centrum-West" "Centrum-West" ...
$ square_feet : num [1:20025] NA NA NA NA NA NA NA NA NA NA ...
$ cleaning_fee : chr [1:20025] "$60.00" NA "$40.00" "$0.00" ...
$ guests_included : num [1:20025] 1 2 2 1 1 1 4 2 1 2 ...
$ extra_people : chr [1:20025] "$20.00" "$0.00" "$10.00" "$0.00" ...
$ minimum_nights : num [1:20025] 3 1 14 2 2 2 3 3 3 3 ...
$ maximum_nights : num [1:20025] 15 1000 60 730 1825 ...
$ availability_30 : num [1:20025] 0 4 2 16 0 0 0 6 13 0 ...
$ availability_60 : num [1:20025] 0 21 26 43 2 0 0 13 43 0 ...
$ availability_90 : num [1:20025] 0 51 26 69 10 0 0 23 67 0 ...
$ availability_365 : num [1:20025] 0 198 71 251 134 0 0 34 121 0 ...
$ instant_bookable : logi [1:20025] TRUE FALSE FALSE TRUE FALSE FALSE ...
$ cancellation_policy : chr [1:20025] "strict_14_with_grace_period" "strict_14_with_grace_period" "strict_14_with_grace_period" "strict_14_with_grace_period" ...
$ require_guest_profile_picture : logi [1:20025] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ require_guest_phone_verification : logi [1:20025] FALSE FALSE FALSE FALSE FALSE FALSE ...
$ calculated_host_listings_count : num [1:20025] 1 2 2 1 3 3 1 1 2 1 ...
$ calculated_host_listings_count_entire_homes : num [1:20025] 0 0 2 0 0 0 1 1 0 1 ...
$ calculated_host_listings_count_private_rooms: num [1:20025] 1 2 0 1 3 3 0 0 0 0 ...
$ calculated_host_listings_count_shared_rooms : num [1:20025] 0 0 0 0 0 0 0 0 0 0 ...
- attr(*, "problems")= tibble [38 x 5] (S3: tbl_df/tbl/data.frame)
..$ row : int [1:38] 15026 15673 15961 16391 16823 16994 16995 17660 17661 17754 ...
..$ col : chr [1:38] "license" "license" "license" "license" ...
..$ expected: chr [1:38] "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" "1/0/T/F/TRUE/FALSE" ...
..$ actual : chr [1:38] "NL857416819B01" "NL825517485B01" "NL825517485B01" "855596338B01" ...
..$ file : chr [1:38] "<connection>" "<connection>" "<connection>" "<connection>" ...
# Convert columns to factors
data_sub$property_type <- factor(data_sub$property_type ,
levels = unique(data_sub$property_type))
data_sub$room_type <- factor(data_sub$room_type ,
levels = unique(data_sub$room_type))
data_sub$bed_type <- factor(data_sub$bed_type ,
levels = unique(data_sub$bed_type))
data_sub$host_response_time <- factor(data_sub$ host_response_time,
levels = unique(data_sub$host_response_time))
data_sub$host_neighbourhood <- factor(data_sub$host_neighbourhood,
levels = unique(data_sub$host_neighbourhood))
data_sub$neighbourhood_cleansed <- factor(data_sub$neighbourhood_cleansed,
levels = unique(data_sub$neighbourhood_cleansed))
data_sub$cancellation_policy <- factor(data_sub$cancellation_policy ,
levels = unique(data_sub$cancellation_policy))
# Remove $ sign from columns containing prices and convert to doubles
data_sub$price <- as.double(gsub("[,$]", "", data_sub$price))
data_sub$cleaning_fee <- as.double(gsub("[,$]", "", data_sub$cleaning_fee))
data_sub$extra_people <- as.double(gsub("[,$]", "", data_sub$extra_people))
# Replace "N/A" values, remove % and convert to percentage
data_sub$host_response_rate <- na_if(data_sub$host_response_rate, "N/A")
data_sub$host_response_rate <-
as.double(gsub("[%]", "", data_sub$host_response_rate)) / 100
# Clean and split strings for amenities
# Returns a list with all unique values
clean_amenities <- function(x) {
subbed <- gsub('[{}\"]', "", tolower(x))
splitted <- str_split(subbed, ",")
clean <- sapply(splitted, function(x) str_trim(x, side = "both"))
return(clean)
}
# Clean amenities
data_sub$amenities_clean <-
sapply(data_sub$amenities, function(x) clean_amenities(x))
# Create vector with all unique amenities
amenities_unique = c()
for(amenities in data_sub$amenities_clean) {
for(element in amenities) {
if(!(element %in% amenities_unique) & element != "") {
amenities_unique <- append(amenities_unique, str_trim(element))
}
}
}
# Show result
amenities_unique
[1] "internet" "wifi" "paid parking off premises" "buzzer/wireless intercom"
[5] "heating" "washer" "smoke detector" "carbon monoxide detector"
[9] "first aid kit" "safety card" "fire extinguisher" "essentials"
[13] "shampoo" "lock on bedroom door" "24-hour check-in" "hangers"
[17] "hair dryer" "iron" "laptop friendly workspace" "translation missing: en.hosting_amenity_49"
[21] "translation missing: en.hosting_amenity_50" "private entrance" "hot water" "bed linens"
[25] "extra pillows and blankets" "single level home" "garden or backyard" "no stairs or steps to enter"
[29] "accessible-height bed" "host greets you" "handheld shower head" "paid parking on premises"
[33] "tv" "refrigerator" "long term stays allowed" "cable tv"
[37] "kitchen" "elevator" "indoor fireplace" "family/kid friendly"
[41] "dryer" "private living room" "well-lit path to entrance" "breakfast"
[45] "self check-in" "smart lock" "lake access" "pets live on this property"
[49] "cat(s)" "smoking allowed" "pets allowed" "microwave"
[53] "coffee maker" "dishwasher" "dishes and silverware" "cooking basics"
[57] "oven" "stove" "patio or balcony" "keypad"
[61] "luggage dropoff allowed" "baby bath" "bathtub" "babysitter recommendations"
[65] "beach essentials" "cleaning before checkout" "ev charger" "pack ’n play/travel crib"
[69] "high chair" "crib" "children’s books and toys" "room-darkening shades"
[73] "children’s dinnerware" "free street parking" "other" "extra space around bed"
[77] "wheelchair accessible" "pocket wifi" "wide hallways" "waterfront"
[81] "washer / dryer" "window guards" "air conditioning" "suitable for events"
[85] "free parking on premises" "lockbox" "wide entrance for guests" "bbq grill"
[89] "ground floor access" "dog(s)" "hot tub" "wide entrance"
[93] "accessible-height toilet" "wide entryway" "hot water kettle" "ethernet connection"
[97] "flat path to guest entrance" "wide doorway to guest bathroom" "wide clearance to shower" "toilet"
[101] "step-free shower" "gym" "outlet covers" "firm mattress"
[105] "changing table" "stair gates" "fireplace guards" "table corner guards"
[109] "game console" "baby monitor" "doorman" "building staff"
[113] "pool" "other pet(s)" "fixed grab bars for shower" "disabled parking spot"
[117] "electric profiling bed" "bathtub with bath chair" "shower gel" "fixed grab bars for toilet"
[121] "beachfront" "shower chair" "trash can" "ski-in/ski-out"
[125] "mobile hoist" "pool with pool hoist" "ceiling hoist" "full kitchen"
[129] "private bathroom" "air purifier" "bread maker" "roll-in shower with chair"
# Create variable for WIFI and add to data set
wifi <- vector()
for(i in 1:length(data_sub$amenities_clean)) {
if("wifi" %in% data_sub$amenities_clean[[i]] |
"internet" %in% data_sub$amenities_clean[[i]]) {
wifi[i] <- "yes"
} else {
wifi[i] <- "no"
}
}
data_sub$wifi <- wifi
data_sub$wifi <- factor(data_sub$wifi, levels = c("yes", "no"))
# Create variable for pool and add to data set
pool <- vector()
for(i in 1:length(data_sub$amenities_clean)) {
if("pool" %in% data_sub$amenities_clean[[i]]) {
pool[i] <- "yes"
} else {
pool[i] <- "no"
}
}
data_sub$pool <- pool
data_sub$pool <- factor(data_sub$pool, levels = c("yes", "no"))
# Create variable for hot_tub and add to data set
hot_tub <- vector()
for(i in 1:length(data_sub$amenities_clean)) {
if("hot tub" %in% data_sub$amenities_clean[[i]]) {
hot_tub[i] <- "yes"
} else {
hot_tub[i] <- "no"
}
}
data_sub$hot_tub <- hot_tub
data_sub$hot_tub <- factor(data_sub$hot_tub, levels = c("yes", "no"))
# Clean and split strings for host verification methods
# Returns a list with all unique values
clean_verficiations <- function(x) {
subbed <- gsub("\\[|\\]", "", tolower(x))
subbed_complete <- gsub("[']", "", subbed)
splitted <- str_split(subbed_complete, ",")
clean <- sapply(splitted, function(x) str_trim(x, side = "both"))
return(clean)
}
# Clean host_verifications
data_sub$host_verifications_clean <-
sapply(data_sub$host_verifications, function(x) clean_verficiations(x))
# Generate list with all unique host verification methods
verifications_unique = c()
for(verifications in data_sub$host_verifications_clean) {
for(element in verifications) {
if(!(element %in% verifications_unique) & element != "") {
verifications_unique <- append(verifications_unique, str_trim(element))
}
}
}
# Show result
verifications_unique
[1] "email" "phone" "reviews" "jumio" "offline_government_id" "selfie" "government_id" "identity_manual"
[9] "facebook" "work_email" "none" "google" "manual_offline" "manual_online" "sent_id" "kba"
[17] "weibo" "zhima_selfie" "sesame" "sesame_offline"
# Create variable for host email and add to data set
host_email <- vector()
for(i in 1:length(data_sub$host_verifications_clean)) {
if("email" %in% data_sub$host_verifications_clean[[i]]) {
host_email[i] <- "yes"
} else {
host_email[i] <- "no"
}
}
data_sub$host_email <- host_email
data_sub$host_email <- factor(data_sub$host_email, levels = c("yes", "no"))
# Create variable for phone and add to data set
host_phone <- vector()
for(i in 1:length(data_sub$host_verifications_clean)) {
if("phone" %in% data_sub$host_verifications_clean[[i]]) {
host_phone[i] <- "yes"
} else {
host_phone[i] <- "no"
}
}
data_sub$host_phone <- host_phone
data_sub$host_phone <- factor(data_sub$host_phone, levels = c("yes", "no"))
# Create variable for host facebook and add to data set
host_facebook <- vector()
for(i in 1:length(data_sub$host_verifications_clean)) {
if("facebook" %in% data_sub$host_verifications_clean[[i]]) {
host_facebook[i] <- "yes"
} else {
host_facebook[i] <- "no"
}
}
data_sub$host_facebook <- host_facebook
data_sub$host_facebook <-
factor(data_sub$host_facebook, levels = c("yes", "no"))
# Create variable for government id
host_government_id <- vector()
for(i in 1:length(data_sub$host_verifications_clean)) {
if("government_id" %in% data_sub$host_verifications_clean[[i]]) {
host_government_id[i] <- "yes"
} else {
host_government_id[i] <- "no"
}
}
data_sub$host_government_id <- host_government_id
data_sub$host_government_id <-
factor(data_sub$host_government_id, levels = c("yes", "no"))
# Create variable for hosts without verification and add to data set
host_verification_method <- vector()
for(i in 1:length(data_sub$host_verifications_clean)) {
if("none" %in% data_sub$host_verifications_clean[[i]]) {
host_verification_method[i] <- "no"
} else {
host_verification_method[i] <- "yes"
}
}
data_sub$host_verification_method <- host_verification_method
data_sub$host_verification_method <-
factor(data_sub$host_verification_method, levels = c("yes", "no"))
# Create new variable for active years host
data_sub <- data_sub %>%
mutate(host_years_active =
as.double(as.Date("2019-12-07") - host_since) / 365)
# Count missing cases per variable
na_counter <- sapply(data_sub, function(x) sum(is.na(x)))
vars <- colnames(data_sub)
# Extract all variables with NA-values
na_values <- tibble(variables = vars, na_count = na_counter)
na_values %>%
filter(na_count > 0) %>%
arrange(desc(na_count))
na_values
The table above shows the variables in the data set that contain missing values (in descending order). The table shows that the variable square_feet has \(19662\) missing cases, which is about \(98.19\%\). If we would deleted the missing cases, the data set will barely contain any data. Moreover, other methods for handling missing values like replacing NA-values with the mean or median would not be appropriate since the variables will be based on only \(1.81\%\) of the data. Therefore, square_feet is nog included in the final. The variable host_response_rate has \(9349\) missing cases, which is about \(46.69\%\). The variable host_neighbourhood has \(5972\), which is about \(29.82\%\). The variable cleaning_fee has \(3604\), which is about \(18.00\%\). The variables host_response_rate, host_neighbourhood and cleaning_fee do not have as many missing values as square_feet, however, the same reasoning applies. As a result, these variables are also excluded from the analyses.
# Compute incomplete rows
incomplete_rows <-
which(is.na(data_sub %>% select(host_response_time, host_listings_count,
host_identity_verified, beds, bedrooms,
host_years_active, bathrooms)))
unique <- length(unique(incomplete_rows))
unique
[1] 682
# Select variables for data set
variables_analysis <-
na_values %>%
filter(na_count <= 649) %>%
select(variables) %>%
pull(variables)
# Create final data set
data_semi_final <- data_sub %>% select(all_of(variables_analysis))
data_semi_final <- data_semi_final %>%
select(-c(amenities, amenities_clean, host_verifications,
host_verifications_clean))
data_final <- data_semi_final[complete.cases(data_semi_final), ]
# Create a train-split sets
set.seed(123)
data_split <- initial_split(data_final, prop = 0.7)
data_train <- training(data_split)
data_test <- testing(data_split)
# Generate 10-fold CV sets
set.seed(321)
data_folds <- vfold_cv(data_train, v = 10)
data_folds
# 10-fold cross-validation